## This script pulls in the original Harris County court data and generates a dataset of first-time cases for the specified time period.
## Ariel White (replication code for "Misdemeanor Disenfranchisement?")
## April 2018

# Start from an empty workspace and work out of the raw-data folder.
rm(list = ls())
setwd("/home/ariel/Dropbox (MIT)/Texas/Harris_fullsentencing")

# Packages needed for loading; lubridate is attached before data.table.
library(foreign)
library(lubridate)
library(data.table)

# The raw extract has no header row: column names are stored in a separate
# file (in its `x` column) and applied after reading.
raw_cases <- read.csv("Harrisfull_nov14.csv", header = FALSE, stringsAsFactors = FALSE)
name_lookup <- read.csv("/home/ariel/Dropbox (MIT)/Texas/HarrisCounty_sentencing/harrisco_colnames.csv")
colnames(raw_cases) <- name_lookup$x

# Derive the filing year (first four characters of fda) and a parsed
# year-month-day filing date from the same field.
raw_cases$fyear <- substr(raw_cases$fda, 1, 4)
raw_cases$filedate <- ymd(raw_cases$fda)

# Continue with a data.table and release the plain data.frame copy.
harris <- data.table(raw_cases)
rm(raw_cases)

##########################################################################
# Sort within def_spn by filing date, determine each person's earliest case,
# and drop all later cases for that person.
# Rows are kept when they are:
# 1. the first case filed,
# 2. filed the same day as the first case filed, or
# 3. the same case # as the first case filed.

# A few records have no SPN (looks like this is early records). Give each
# distinct def_nam among them its own synthetic SPN.
# The IDs are numbered per name group (.GRP) and offset past the largest real
# SPN. Numbering rows 1..N inside every name group would hand the same IDs
# (1, 2, ...) to unrelated people and could also collide with genuine SPNs.
# NOTE(review): records with a missing SPN and an identical (or blank) def_nam
# are treated as one person here -- confirm that is acceptable.
sum(is.na(harris$def_spn))
setkey(harris, def_spn, def_nam)
if (anyNA(harris$def_spn)) {
  spn_offset <- if (all(is.na(harris$def_spn))) 0L else max(harris$def_spn, na.rm = TRUE)
  harris[is.na(def_spn), def_spn := spn_offset + .GRP, by = "def_nam"]
}

# Order each person's cases chronologically and number them.
setkey(harris, def_spn, filedate)
library(zoo)  # na.locf(); library() fails loudly if zoo is not installed
harris[, caseorder := seq_len(.N), by = list(def_spn)]
harris[caseorder == 1, firstcase := 1]

# Record the first case's filing date and case number on that row, then carry
# them forward to the person's remaining rows.
harris[firstcase == 1, firstcasedate := fda]
harris[firstcase == 1, firstcaseno := cas]
harris[, firstcasedate := na.locf(firstcasedate), by = def_spn]
harris[, firstcaseno := na.locf(firstcaseno), by = def_spn]

# Keep rows sharing the first case's number or its filing date.
firsttime <- harris[cas == firstcaseno | fda == firstcasedate, ]

dim(harris); dim(firsttime)
length(unique(firsttime$def_spn))
setkey(firsttime, "def_spn", "fda")

# Dispositions recording that an earlier probation agreement was revoked or
# terminated are removed -- this all happens later.
firsttime2 <- firsttime[!(disposition %like% "TERMINATED" | disposition %like% "PROBATION REVOKED")]
dim(firsttime2); dim(firsttime)

firsttime <- firsttime2

##########################################################################
# Subset to the filing years of interest.
# The main analysis uses several different time periods; comment/uncomment the
# lines below to generate the different files.
study_years <- c("2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012")
firsttime <- firsttime[fyear %in% study_years]
#firsttime <- firsttime[fyear %in% c("2012", "2013", "2014"), ]

# The full case table is no longer needed.
rm(harris)
# Clean up case outcomes: 0/1 indicators parsed from the free-text sentence.

# jail = 1 when the sentence mentions any confinement keyword.
firsttime[, jail := 0]
for (jail_pat in c("HCJ", "CONFINEMENT", "JAIL", "TDC")) {
  firsttime[!is.na(sentence) & sentence %like% jail_pat, jail := 1]
}

# fine = 1 when the sentence mentions a fine; the amount is the first
# "$<digits>" run found in the sentence, stored as text and as a number.
firsttime[, fine := 0]
firsttime[!is.na(sentence) & sentence %like% "FINE", fine := 1]
library(stringr)
firsttime[, fineamount := str_extract(sentence, "\\$([0-9])*")]
firsttime[, finenum := as.numeric(gsub("\\$", "", fineamount))]

# probation = 1 when the sentence mentions probation.
firsttime[, probation := 0]
firsttime[!is.na(sentence) & sentence %like% "PROBATION", probation := 1]

# Probation length, in days, parsed from the sentence text.

# For each probation row, take the first ", "-separated piece of the sentence
# that contains " PROBATION". Doing this row by row keeps every result aligned
# with its own row. Grepping one flattened vector of all pieces only lines up
# if every sentence has exactly one matching piece.
firsttime[probation == 1, probationchunk := vapply(
  strsplit(sentence, ", "),
  function(pieces) {
    hits <- grep(" PROBATION", pieces, value = TRUE)
    if (length(hits) > 0) hits[1] else NA_character_
  },
  character(1)
)]
firsttime[probation == 1, probation1 := gsub(" PROBATION", "", probationchunk)]

# First whitespace-separated token is the amount, second is the unit.
firsttime[, probationtime := as.numeric(vapply(strsplit(probation1, " "), `[`, character(1), 1))]
firsttime[, probationtimeunit := vapply(strsplit(probation1, " "), `[`, character(1), 2)]

# Convert everything to days (a year counts as 365 days, a month as 30).
firsttime[probationtimeunit %like% "DAY", probationtimeadj := probationtime]
firsttime[probationtimeunit %like% "YEAR", probationtimeadj := 365 * probationtime] #difftime doesn't recognize these as units, so give them something.
firsttime[probationtimeunit %like% "MONTH", probationtimeadj := 30 * probationtime]

class(firsttime$probationtimeadj)
firsttime[!is.na(probationtimeadj), probationlength := as.difftime(probationtimeadj, units = "days")]
firsttime[!is.na(probationlength), probationdays := as.numeric(probationlength, units = "days")]

#drop some extraneous stuff
firsttime[, c("probationchunk", "probation1", "probationtime", "probationtimeunit", "probationtimeadj") := NULL]

# Jail/prison sentence length, in days, parsed from the sentence text.
# Step 1: for every row, split the sentence on each facility marker and keep
# the text that precedes the marker (stored as list columns).
firsttime$jail1 <- lapply(strsplit(firsttime$sentence, " HCJ"), "[", 1)
firsttime$jail2 <- lapply(strsplit(firsttime$sentence, " CONFINEMENT"), "[", 1)
firsttime$jail3 <- lapply(strsplit(firsttime$sentence, " STATE JAIL"), "[", 1)
firsttime$jail4 <- lapply(strsplit(firsttime$sentence, " TDC"), "[", 1)

# Step 2: keep that preceding text only on rows whose sentence actually
# contains the corresponding keyword.
firsttime[sentence %like% "HCJ" & is.na(sentence)==F, jail1a := unlist(jail1)]
firsttime[sentence %like% "CONFINEMENT" & is.na(sentence)==F, jail2a := unlist(jail2)]
firsttime[sentence %like% "STATE JAIL" & is.na(sentence)==F, jail3a := unlist(jail3)]
firsttime[sentence %like% "TDC" & is.na(sentence)==F, jail4a := unlist(jail4)]

# Step 3: merge into one column. Later assignments overwrite earlier ones, so
# when several keywords match the priority is TDC > STATE JAIL > CONFINEMENT > HCJ.
firsttime[is.na(jail1a)==F, jailtest := jail1a]
firsttime[is.na(jail2a)==F, jailtest := jail2a]
firsttime[is.na(jail3a)==F, jailtest := jail3a]
firsttime[is.na(jail4a)==F, jailtest := jail4a]
# Step 4: first space-separated token is the amount, second is the unit
# (jailtimeunit is left as a list column; it is dropped below).
firsttime$jailtime <- as.numeric(lapply(strsplit(firsttime$jailtest, " "), "[", 1))
firsttime$jailtimeunit <- lapply(strsplit(firsttime$jailtest, " "), "[", 2)

# Step 5: convert to days (a year counts as 365 days, a month as 30).
firsttime[jailtimeunit %like% "DAY", jailtimeadj := jailtime]
firsttime[jailtimeunit %like% "YEAR", jailtimeadj := 365*jailtime] 
firsttime[jailtimeunit %like% "MONTH", jailtimeadj := 30*jailtime]

# Store the length both as a difftime (sentencelength) and as plain numeric days (sentencedays).
firsttime[is.na(jailtimeadj) ==F, sentencelength:= as.difftime(jailtimeadj, units="days")]
firsttime[is.na(sentencelength)==F, sentencedays := as.numeric(sentencelength, units="days")]

# Remove the intermediate parsing columns (jailtime itself is kept).
firsttime[, jail1:=NULL]
firsttime[, jail1a:=NULL]
firsttime[, jail2:=NULL]
firsttime[, jail2a:=NULL]
firsttime[, jail3:=NULL]
firsttime[, jail3a:=NULL]
firsttime[, jail4:=NULL]
firsttime[, jail4a:=NULL]
firsttime[, jailtest:=NULL]
firsttime[, jailtimeadj:=NULL]
firsttime[, jailtimeunit:=NULL] 

# nonconv = 1 when the disposition text indicates an acquittal or dismissal.
firsttime[, nonconv := 0]
for (nonconv_pat in c("ACQUITTAL", "DISMISSED", "DISM OTHER")) {
  firsttime[disposition %like% nonconv_pat, nonconv := 1]
}

# Date of birth (parsed), year of birth (first four characters of def_dob),
# and age at filing as the difference between filing date and birth date.
firsttime[, `:=`(
  birthdate = ymd(def_dob),
  def_yob = substr(def_dob, 1, 4)
)]
firsttime[, ageatfile := filedate - birthdate]

dim(firsttime); length(unique(firsttime$def_spn))
# Collapse multiple charges from the same date into one observation. (They'll
# have gone through the same court assignment mechanism.)
# Jail/probation days and fines are summed; the jail/fine/probation indicators
# take their maximum ("any"); anyconv is 1 unless every charge is a
# non-conviction; the most severe charge class is the smallest chargetype code.

# Severity code for each charge class: 1 = most severe (F1) ... 8 = least (M).
# Classes not listed here are left NA.
charge_levels <- c("F1", "F2", "F3", "FS", "MA", "MB", "MC", "M")
firsttime[, chargetype := as.numeric(match(com_l_d, charge_levels))]

firsttime[, casesnum := 1]
# Also (for people with only one case), pull through the actual charge. These
# columns are NA on every row other than the person's first-ordered case, so
# max() below returns NA whenever a group contains more than that one row.
firsttime[firstcase == 1, curr_off_lit_1 := curr_off_lit]
firsttime[firstcase == 1, com_off_lit_1 := com_off_lit]

# Within each person, put the most severe charge first.
setkey(firsttime, "def_spn", "chargetype")

# NOTE(review): min(chargetype) is NA for any group containing an unmapped
# charge class -- confirm that is intended.
collapsedfirsttime <- firsttime[, list(
  totalsentencedays = sum(sentencedays, na.rm = TRUE),
  anyjail = max(jail),
  anyfine = max(fine),
  anyprobation = max(probation),
  totalprobationdays = sum(probationdays, na.rm = TRUE),
  totalfineamt = sum(finenum, na.rm = TRUE),

  anyconv = 1 - min(nonconv),
  mostsevcharge = min(chargetype),
  numcases = sum(casesnum),
  curr_off_lit_1 = max(curr_off_lit_1),
  com_off_lit_1 = max(com_off_lit_1),
  disposition = max(disposition),
  # Not sure which name is right in the (very few) cases where they
  # inexplicably differ. Take the first one in key order rather than a random
  # draw so that re-running the script reproduces the same file.
  def_nam = def_nam[1]
), by = c("def_spn", "fyear", "firstcasedate", "crt", "def_stnum", "def_stnam", "def_cty", "def_st", "def_zip", "def_sex", "def_yob", "def_rac", "def_dob", "ageatfile")] # each grouping column listed once
dim(collapsedfirsttime); length(unique(firsttime$def_spn)) #should be the same.


save(collapsedfirsttime, file="harrisfirsttime200012.Rdata")
#save(collapsedfirsttime, file="harrisfirsttime201214.Rdata")

###########################################################################################################################
## Next, here's the code used to prepare these files for geocoding in Arc
rm(list = ls())

# --- 2012-2014 defendants ---
# Load the file under the name the save() step for the 2012-2014 period writes
# ("harrisfirsttime201214.Rdata"); the previous name here
# ("harrisfirsttime20122014.Rdata") is not produced anywhere in this script.
load("harrisfirsttime201214.Rdata")
# Single street-address field: house number followed by street name.
collapsedfirsttime$fullstreet <- paste(collapsedfirsttime$def_stnum, collapsedfirsttime$def_stnam, sep = " ")
# Zip codes "0" and "77000" are not valid: blank them out, then count missing.
collapsedfirsttime[def_zip == 0 | def_zip == 77000, def_zip := NA]; sum(is.na(collapsedfirsttime$def_zip))
# Only Texas addresses are sent for geocoding.
TXonly <- collapsedfirsttime[def_st == "TX", ]; dim(collapsedfirsttime); dim(TXonly)
write.csv(TXonly, file = "harrisfirsttime_1214_forgeocoding.csv")

# --- 2000-2012 defendants: same preparation ---
rm(collapsedfirsttime)
load("harrisfirsttime200012.Rdata")
collapsedfirsttime$fullstreet <- paste(collapsedfirsttime$def_stnum, collapsedfirsttime$def_stnam, sep = " ")
sum(is.na(collapsedfirsttime$def_zip)) #but others should be missing: "0" or "77000" are not valid.
collapsedfirsttime[def_zip == 0 | def_zip == 77000, def_zip := NA]; sum(is.na(collapsedfirsttime$def_zip))
TXonly <- collapsedfirsttime[def_st == "TX", ]; dim(collapsedfirsttime); dim(TXonly)
write.csv(TXonly, file = "harrisfirsttime_0012_forgeocoding.csv")


### and do the same sort of preparation for the old (2012, acquired in 2013) Harris voter file

load("Harrisco_voters_old.RData")
# Sort first so any null addresses would surface at the ends of the table.
setkey(harris, residential_address1, residential_city, residential_state, residential_zip5)
tab <- sort(table(harris$residential_address1)); head(tab); tail(tab) #does look like everyone has addresses.
# Presort for efficiency instead; the last setkey() defines the key used below.
setkey(harris, residential_state, residential_city, residential_zip5, residential_address1)
setkey(harris, residential_state, residential_zip5, residential_city, residential_address1, state_file_id)
# Also drop any (inexplicable) duplicates, judged on the key columns.
# `by` is spelled out because unique() on a data.table only defaults to the
# key in data.table < 1.9.8; later versions compare all columns unless told
# otherwise.
dim(harris)
harris <- unique(harris, by = key(harris))
dim(harris)

# Geocoding the full table takes too long to run / keeps crashing: strip out
# the extraneous columns and merge them back in later.
trim <- subset(harris, select=c("nbec_guid","state_file_id", "residential_address1", "residential_address2", "residential_city", "residential_state", "residential_zip5"))
write.csv(trim, file="harrisvoters2013_trimmedforArc.csv")

# Split into two files: the first 1,000,000 rows and the remainder.
# The split point is capped at nrow(trim), so a table of 1,000,000 rows or
# fewer gives an empty second part instead of a reversed, NA-padded index.
length1 <- min(1000000, nrow(trim)) #round(nrow(trim)/2)
trim1 <- trim[seq_len(length1), ]
trim2 <- trim[seq_len(nrow(trim) - length1) + length1, ]
nrow(trim1) + nrow(trim2); nrow(trim)
write.csv(trim1, file="harrisvoters2013_trimmedforArc_pt1.csv")
write.csv(trim2, file="harrisvoters2013_trimmedforArc_pt2.csv")
dim(trim1); dim(trim2)

###########################################################################################################################
## The files above were geocoded in Arc (as discussed in SI) and exported as shapefiles.
## Read them back in here and tidy up.

rm(list = ls())
setwd("/home/ariel/Dropbox/Texas/neighbors")
library(foreign)
library(data.table)
library(maps)
library(sp)
library(maptools)
library(rgdal)

## First the 2012-2014 defendants.
ogrInfo(dsn = ".", layer = "harrisdef_TX201214_arcgeo")
defs_spatial <- readOGR(dsn = "/home/ariel/Dropbox (MIT)/Texas/neighbors", layer = "harrisdef_TX201214_arcgeo")
dim(defs_spatial)
class(defs_spatial)
# Drop the spatial structure and keep an ordinary data frame.
geodef12flat <- as.data.frame(defs_spatial)
# One of the main files pulled in by "Mainmerge_spring2018.R".
save(geodef12flat, file = "Harrisdefendants20122014_Arcgeocoded.Rdata")
proj4string(defs_spatial)

## And now the full 2000-2012 defendants file.
rm(list = ls())
setwd("/home/ariel/Dropbox/Texas/neighbors")
library(foreign)
library(data.table)
library(maps)
library(sp)

library(maptools)
library(rgdal)
ogrInfo(dsn = ".", layer = "harrisdefs0012_arcgeo")
defs_spatial <- readOGR(dsn = "/home/ariel/Dropbox/Texas/neighbors", layer = "harrisdefs0012_arcgeo")
dim(defs_spatial)
class(defs_spatial)
# Same flattening; the object keeps the name geodef12flat inside the saved file.
geodef12flat <- as.data.frame(defs_spatial)
# One of the main files pulled in by "Mainmerge_spring2018.R".
save(geodef12flat, file = "Harrisdefendants20002012_Arcgeocoded.Rdata")

# And now the older (2013) voter file, which was geocoded in two parts.
rm(list = ls())
setwd("/home/ariel/Dropbox (MIT)/Texas/neighbors")
library(foreign)
library(data.table)
library(maps)
library(sp)
library(maptools)
library(rgdal)

ogrInfo(".", "harrisvoters2013_Arcgeocoded_pt1")
geo2 <- readOGR(dsn = "/home/ariel/Dropbox (MIT)/Texas/neighbors", layer = "harrisvoters2013_Arcgeocoded_pt2")
geo1 <- readOGR(dsn = "/home/ariel/Dropbox (MIT)/Texas/neighbors", layer = "harrisvoters2013_Arcgeocoded_pt1")
dim(geo2); class(geo2)
dim(geo1); class(geo1)
nrow(geo2) + nrow(geo1) # GOOD.

# Flattened copies of each half, used here only to inspect their dimensions.
geo1flat <- as.data.frame(geo1); dim(geo1flat)
geo2flat <- as.data.frame(geo2); dim(geo2flat)

geovoters <- spRbind(geo1, geo2) #stick the two halves together.
dim(geovoters)

geovotersflat <- as.data.frame(geovoters) #de-spatial it

# Now merge it back to the voter file cols.
load("../Voterfile/Harrisco_voters_old.RData")
library(data.table)
setkey(harris, residential_state, residential_zip5, residential_city, residential_address1, state_file_id)
dim(harris)
# Deduplicate on the key columns. `by` is explicit because unique() on a
# data.table only defaults to the key in data.table < 1.9.8; later versions
# compare all columns unless told otherwise.
harris <- unique(harris, by = key(harris))
dim(harris)
setkey(harris, state_file_id)

# Left join: every voter row is kept, matched to the geocoded columns through
# the voter ID (named "state_file" in the geocoded table).
fullfile <- merge(harris, geovotersflat, by.x = "state_file_id", by.y = "state_file", all.x = TRUE)
dim(fullfile);  dim(harris); dim(geovotersflat)

proj4string(geo1) #projection (use this later):  +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0

save(fullfile, file = "Harrisvoters2013_Arcgeocoded.Rdata") #and this is what gets used (as well as the above defendant files) by "Mainmerge_spring2018.R"





